In [45]:
%matplotlib inline
import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
import seaborn as sns
sns.set_style("white")
conn = sqlite3.connect('../data/output/database.sqlite')
c = conn.cursor()
def execute(sql):
'''
Executes a SQL command on the 'c' cursor and returns the results
'''
c.execute(sql)
return c.fetchall()
def printByYear(data):
'''
Given a list of tuples with (year, data), prints the data next to corresponding year
'''
for datum in data:
print "{0}: {1}".format(datum[0], datum[1])
In [39]:
years = {1996:1.46, 1997:1.43, 1998:1.4, 1999:1.38, 2000:1.33, 2001:1.3, 2002:1.28, 2003:1.25,
2004:1.22, 2005:1.18, 2006:1.14, 2007:1.11, 2008:1.07, 2009:1.07, 2010:1.05, 2011:1.02, 2012:1}
def adjustForInflation(value, year):
if value == None:
return
return value * years[year]
Time series of different schools to show change over time
Animated dot plot that shows change over time with animation
Currently seems like time series is the best option. Start with that and expand. Features to add or explore?:
In [2]:
# Median loan debt of those who graduate programs
debt = execute("""SELECT year, grad_debt_mdn
FROM Scorecard
WHERE year%2=0 AND grad_debt_mdn IS NOT NULL""")
In [12]:
def graphDistForYears(data, year1, year2):
data1 = pd.DataFrame()
data2 = pd.DataFrame()
# 1.4 is adjusting for inflation
data1['x'] = [1.4 * row[1] for row in data if row[0]==year1 and (isinstance(row[1], float) or isinstance(row[1], int))]
data2['x'] = [row[1] for row in data if row[0]==year2 and (isinstance(row[1], float) or isinstance(row[1], int))]
sns.distplot(data1, kde=False)
sns.distplot(data2, kde=False)
In [13]:
# Years with meadian student debt
from sets import Set
years = Set()
for row in debt:
years.add(row[0])
print years
graphDistForYears(debt, 1998, 2012)
In [6]:
# Graph makes it look like there mgith be anomalies in the data such as negative debt, but
# this is just a quirk of the regression and not an actual fact
print len([row[1] for row in debt if isinstance(row[1], float) and row[1] < 0])
In [7]:
# Net tuition revenue per student for a institution
tuitionRev = execute("""SELECT year, tuitfte, instnm, ugds
FROM Scorecard
WHERE tuitfte IS NOT NULL and main='Main campus' and ugds>1000""")
In [14]:
graphDistForYears(tuitionRev, 1998, 2012)
In [9]:
top10 = [[0,0,0] for i in range(0, 10)]
for inst in [row for row in tuitionRev if row[0]==1998]:
i = 0
done = False
while i < len(top10) and not done:
if (top10[i][1] < inst[1]):
top10[i][0] = inst[2]
top10[i][1] = inst[1]
top10[i][2] = inst[3]
done = True
i += 1
for top in top10:
print "{0} -- {1} -- {2}".format(top[0], top[1], top[2])
Odd outliers throughout needs to be explored and cleaned more
In [10]:
expenses = execute("""SELECT year, inexpfte
FROM Scorecard
WHERE inexpfte IS NOT NULL and main='Main campus' and ugds>1000""")
In [15]:
graphDistForYears(expenses, 1998, 2012)
In [27]:
purdueData = execute("""SELECT year, tuitfte, inexpfte, grad_debt_mdn
FROM Scorecard
WHERE instnm='PURDUE UNIVERSITY-MAIN CAMPUS'""")
In [28]:
purdueData[:10]
Out[28]:
In [35]:
def graphPurdueData(index):
df = pd.DataFrame()
df['Dollars'] = [adjustForInflation(row[index], row[0]) for row in purdueData]
df['Year'] = [row[0] for row in purdueData]
graph = sns.regplot('Year', 'Dollars',
data=df,
fit_reg=False)
In [40]:
graphPurdueData(1)
In [41]:
graphPurdueData(2)
In [44]:
graphPurdueData(3)
Problems noted so far:
To clean up:
Perhaps:
In [ ]: